/*
********************************************************************************
Description:		Import, clean and append Valid View datasets

Uses:				"${VVindata}\EY`year'.csv"
					"${IVindata}\inv`yr'.csv"

Saves:				"${VVoutdata}\ValidView`year'.dta"
					"${VVoutdata}\InvalidView`year'.dta"
					"${VVoutdata}\ValidView_1997_2012.dta"
					
********************************************************************************
*/

/*
For a case to be in VV it needs to have:
- A valid postcode signal of 'Y' (in valid_po), or
- An abroad signal of 'Y' (in txp_add)


Variables we need

tax_year		tax year
utr_no			UTR
txp_add_		Indicates if taxpayer's base address is abroad				
gorcode			Government Office Region
agent_in		Indicates if the taxpayer has an agent
age				Age
sex				Sex
deceased		Deceased
ptnr_no			Number of partner schedules
trad_no			Number of trade (self-employment) schedules
employ_c		Number of employment schedules
close_co		Flag for close company
const_in		Construction industry flag
it_aft_a		Total Income Tax due after allowances and reliefs
txp_type		Individual or partnership indicator
valid_po		Valid postcode signal
retrec_l		Latest return received date
tcn_tsa_		Trade classification number of trade A
tcn_tsb_		Trade classification number of trade B
tcn_psa_		Trade classification number of partner A
tcn_psb_		Trade classification number of partner B

*/





******************************************************************
* cleanVV: import, clean and append  Valid View across all years *
******************************************************************

* Typical usage
	* cleanVV, saving("${VVoutdata}\ValidView_1997_2012.dta")
	
capture program drop cleanVV
program define cleanVV

	syntax, saving(string)
	
	set more off
	
	if regexm("`saving'", "(.*)\.dta") local saving = regexs(1)
	
	
	* First import the main files 
	*******************************
	*******************************
	

	* Import, clean and save data for each year
	*******************************************
	
	forvalues year = 1997/2012 {
	
		* Main Valid View datasets
		*-------------------------

		di as text "Importing Valid View `year' (`c(current_time)')"
		importVV using "${VVindata}\EY`year'.csv", saving("${VVoutdata}\ValidView`year'.dta")

		* Main Invalid View datsets
		*--------------------------
		
		di as text "Importing Invalid View `year' (`c(current_time)')"
		* For some unknown reason, the delimiters in the 2000 Invalid View file are spaces rather than commas
		local delimiters ""
		if (`year' == 2000) local delimiters `"delimiters(" ")"'
		local yr = substr("`year'",3,2)
		importVV using "${IVindata}\inv`yr'.csv", `delimiters' saving("${VVoutdata}\InvalidView`year'.dta")
		
	}

	
	
	* Append
	********

	local appendtxt ""
	forvalues year = 1997/2012 {
	
		local appendtxt `"`appendtxt' "${VVoutdata}\ValidView`year'.dta""'
		local appendtxt `"`appendtxt' "${VVoutdata}\InvalidView`year'.dta""'
		
	}

	* Append (onto blank dataset)
	clear
	quietly append using `appendtxt'

	

	* Delete observations that are duplicated (appear in both VV and IV)
	********************************************************************

	* Keep only the duplicate cases
	sort utr_no tax_year
	by utr_no tax_year: gen byte count = _N
	preserve
	quietly keep if count > 1
	drop count

	* Copy information across from IV cases if it is missing in VV
	quietly ds utr_no tax_year validview, not 
	foreach var in `r(varlist)' {
		local vartype : type `var'
		if (substr("`vartype'",1,3) == "str") gen byte missing = (`var' == "")
		else gen byte missing = (`var' >= .)
		quietly bysort utr_no tax_year (missing `var'): gen `vartype' temp = `var'[1]
		if (substr("`vartype'",1,3) == "str") gen byte tempmissing = (temp == "")
		else gen byte tempmissing = (temp >= .)
		quietly replace `var' = temp if (validview == 1 & missing & !tempmissing)
		drop missing temp tempmissing 
	}

	* Drop IV cases
	drop if (validview == 0)
	
	* Save as temporary file
	tempfile temp
	qui save `temp'

	* Append onto full VV/IV
	restore
	quietly keep if count == 1
	drop count
	append using `temp'

	
	* Tidy up female and age
	************************
		
	* Female
	egen meanfemale = mean(female), by(utr_no)
	qui replace meanfemale = 0 if (meanfemale > 0 & meanfemale < 0.5)
	qui replace meanfemale = 1 if (meanfemale >= 0.5 & meanfemale < 1)
	qui replace female = meanfemale
	drop meanfemale
	
	* Age
	qui gen int agenew = age
	qui bysort utr_no (tax_year): replace agenew = agenew[_n-1] + (tax_year - tax_year[_n-1]) if (agenew >= .) & (agenew[_n-1] < .) & (_n > 1)
	gen int tax_year_neg = -tax_year
	qui bysort utr_no (tax_year_neg): replace agenew = agenew[_n-1] + (tax_year - tax_year[_n-1]) if (agenew >= .) & (agenew[_n-1] < .) & (_n > 1)
	qui replace age = agenew
	drop agenew tax_year_neg
	sort utr_no tax_year
	
	
	* Save
	******
	
	quietly compress
	sort utr_no tax_year
	save "`saving'.dta", replace


end



****************************************************************
* importVV: import and clean Valid View dataset for given year *
****************************************************************

* Example usage
	* importVV using "${VVindata}\EY1999.csv"
	
capture program drop importVV
program define importVV


	syntax using/, [delimiters(string asis)] saving(string)

	
	* Work out whether it's Valid View or Invalid View
	if regexm("`using'","EY") local validview = 1
	else if regexm("`using'","inv") local validview = 0
	else {
		di as error "Couldn't work out whether dataset is Valid View or Invalid View"
		exit
	}
	
	
	* Import the data
	*****************

	quietly import delimited using "`using'", delimiters(`delimiters') clear

	

	* Tidy the data
	***************
	
	
	* Variable checks
	*----------------
	
	* Check that the data contain only one value of tax_year
	qui su tax_year
	assert (r(min)==r(max))	
	
	* Check that observations are uniquely identified by utr_no
	capture rename anon_utr utr_no
	capture rename utref utr_no
	capture rename utref_anon utr_no
	isid utr_no

	* Remove suffix "_" from some variables
	rename *_ *


	
	* Keep variables
	*---------------

	* This is the full list of variables we are interested in
	
	# delimit ;
	local tokeep "tax_year utr_no txp_add gorcode agent_in age sex 
		deceased ptnr_no it_aft_a txp_type valid_po
		retcap_f retcap_l retisd_l retrec_l fil_hdbu
		tcn_tsa tcn_tsb tcn_psa tcn_psb"
		;
	# delimit cr
	
	unab allvars : _all
	local tokeep : list tokeep & allvars
	
	keep `tokeep'


	* Create any variables that are missing
	*--------------------------------------
	
	* Not all variables exist in every year in both VV and IV
	
	* In IV
		* txp_add_ (taxpayer address abroad) only available in 1997
		* it_aft_a (income tax after allowances) not available in 2006
		* gorcode (government office region) not available in 2011
	

	* byte variables
	foreach var in ptnr_no agent_in sex gorcode {
		capture confirm variable `var'
		if _rc quietly gen byte `var' = .z
	}
	
	* int variables
	foreach var in age tcn_tsa tcn_tsb tcn_psa tcn_psb {
		capture confirm variable `var'
		if _rc quietly gen int `var' = .z
	}
	
	* float variables
	foreach var in it_aft_a {
		capture confirm variable `var'
		if _rc quietly gen float `var' = .z
	}
		
	* string variables
	foreach var in txp_type txp_add deceased extract retcap_f retcap_l retisd_l retrec_l fil_hdbu valid_po {
		capture confirm variable `var'
		if _rc quietly gen str1 `var' = ""
	}
	

	* Create new variables
	*---------------------
	
	* Female indicator
	quietly gen byte female = sex - 1
	quietly recode female (2=.)
	drop sex
	
	* Clean age
	quietly recode age (0=.) (150/1000=.)
	
	* Encode a series of Y/N variables to 0/1
	foreach var in txp_add deceased valid_po {
		rename `var' temp
		quietly gen byte `var' = .
		quietly replace `var' = 0 if temp == "N"
		quietly replace `var' = 1 if inlist(temp,"Y","S")
		drop temp
	}
	

	* Encode date variables
	* (Pre-2007 they are all missing so are imported as byte variables. For these years we make them string variables first)
	foreach var in extract retcap_f retcap_l retisd_l retrec_l fil_hdbu {
		capture confirm string variable `var'
		if _rc quietly tostring `var', replace
		rename `var' temp
		quietly gen long `var' = date(temp, "DMY")
		format `var' %tdDD/NN/CCYY
		drop temp
	}

	
	* Sort out gorcode
	* (In VV 2011 and 2012 and IV 2012, gorcode is not 1-12 but takes values such as E12000007)
	capture confirm string variable gorcode
	if !_rc {
		rename gorcode temp
		quietly gen byte gorcode = .
		quietly replace gorcode = 1  if (temp == "E12000001")
		quietly replace gorcode = 2  if (temp == "E12000002")
		quietly replace gorcode = 3  if (temp == "E12000003")
		quietly replace gorcode = 4  if (temp == "E12000004")
		quietly replace gorcode = 5  if (temp == "E12000005")
		quietly replace gorcode = 6  if (temp == "E12000006")
		quietly replace gorcode = 7  if (temp == "E12000007")
		quietly replace gorcode = 8  if (temp == "E12000008")
		quietly replace gorcode = 9  if (temp == "E12000009")
		quietly replace gorcode = 10 if (temp == "W99999999")
		quietly replace gorcode = 11 if (temp == "S99999999")
		quietly replace gorcode = 12 if (temp == "N99999999")
		quietly replace gorcode = 13 if (temp == "L99999999")
		quietly replace gorcode = 14 if (temp == "M99999999")
		capture assert ((gorcode == .) - (temp == "") == 0)
		if _rc {
			di as error "gorcode takes on other values"
			exit
		}
		drop temp
	}
	
	* Sort out trade classification numbers (TCNs)
	* Sometimes these are string variables because they contain either "none" or ""
	foreach var of varlist tcn_tsa tcn_tsb tcn_psa tcn_psb {

		capture confirm string variable `var'
		
		* If string variable
		if (!_rc) {
			qui replace `var' = "0000" if inlist(`var', "none", "")
			qui destring `var', replace
		}
		
		* If numeric variable
		else {
			* Recode missing to zero if numeric
			qui replace `var' = 0 if (`var' == .)
		}
		
		assert (`var' < .)
		
	}
	
	
	* Create an identifier for being in Valid View (rather than Invalid View)
	gen byte validview = `validview'
	label variable validview "Obs from VV"
	
	
	
	* Label variables
	*----------------
	
	label variable tax_year 	"tax year"
	label variable utr_no 		"UTR"
	label variable txp_add 		"Indicates if taxpayer's base address is abroad"
	label variable gorcode 		"Government Office Region"
	label variable agent_in		"Indicates if the taxpayer has an agent"
	label variable age 			"Age"
	label variable female 		"Female indicator"
	label variable deceased 	"Deceased"
	label variable ptnr_no 		"Count of number of partner schedules"
	label variable it_aft_a 	"Total Income Tax due after allowances and reliefs"
	label variable txp_type 	"Individual or partnership indicator"
	label variable extract 		"Date of extract from Telford"
	label variable valid_po 	"Valid postcode signal"
	label variable retrec_l 	"Latest return received date"
	label variable retcap_f 	"Return capture date first"	
	label variable retcap_l 	"Return capture date last"	
	label variable retisd_l 	"Latest return issue date"	
	label variable fil_hdbu 	"Date filing info last updated"	
	label variable tcn_tsa		"Trade classification number of trade A"
	label variable tcn_tsb		"Trade classification number of trade B"
	label variable tcn_psa		"Trade classification number of partner A"
	label variable tcn_psb		"Trade classification number of partner B"
	  
	# delimit ;
	label define gorcode 1 "North East" 2 "North West" 3 "Yorkshire and The Humber"
		4 "East Midlands" 5 "West Midlands" 6 "East of England" 7 "London" 8 "South East" 
		9 "South West" 10 "Wales" 11 "Scotland" 12 "Northern Ireland" 13 "Channel Islands"
		14 "Isle of Man"
		;
	# delimit cr
	label values gorcode gorcode
	

	* Compress, sort and save
	*------------------------

	quietly compress
	sort utr_no tax_year
	quietly save "`saving'", replace

end


exit

